#Nature's Kidneys: the Role of Wetland Reserve Easements in Restoring Water Quality
#Nicole Karwowski and Marin Skidmore
#Replication File
#2025

#Public water system facility data analysis


#Packages
library(lubridate)
library(ggplot2)
library(dplyr)
library(fixest)
library(plm)
library(ggplot2)
library(readr)
library(mapview)
library(fixest)
library(stringr)
library(stargazer)
library(ggfixest)
library(cowplot)
library(dplyr)
library(broom)
library(openxlsx)
library(sf)
library(tidyverse)
library(readxl)
library(scales)

setwd("C:/Users/n57m645/OneDrive - Montana State University/UW_Madison/Water quality")

#pws intake by huc12 information from austin wes
#Environmental Protection Agency's National Center for Environmental Economics
#data is from 2022 quarter 2
#at the huc12 level
pws_huc12 <- read_excel("Data/Water System EPA/PWS_HUC12_2022Q2_A_I_withDups.xlsx")

#right now, there is a duplicate if there is multiple intakes. 
#we will focus on facilities, not number of intakes
#get rid of the duplicates
#group by pwsid_facilityid

pws_facilities<- pws_huc12 %>% group_by(pwisd_facilityid) %>% filter(row_number()==1)
rm(pws_huc12)

#merge in the publicly available data on pwsid to get population served estimates
wss <- read_excel("Data/Water System EPA/Water System Summary_20240321.xlsx")

pws_facilities<-merge(pws_facilities, wss, by.x="PWSID", by.y="PWS ID", all.x=TRUE, all.y=FALSE)

#lets focus on facilities in the MARB
pws_facilities$HUC2<-substring(pws_facilities$HUC_12,1,2)
pws_facilities<-subset(pws_facilities, pws_facilities$HUC2 %in% c("05", "06", "07", "08", "10", "11"))

#there are 108,049 faciliites in the MRB. 

#lets focus on active facilities
pws_facilities<-subset(pws_facilities, pws_facilities$PWS_ACTIVITY_CODE=="A")
#74,833 are active. 

#number of facilities in a huc12 that are treated
load("Data/NRCS easements/Processed/easements_panel_huc_ym.RData")
ease_huc12<-subset(ease_huc12, select=c(huc12, YearMonth, RestoredEasementAcres))

ease_huc12 <- ease_huc12 %>% group_by(huc12) %>% mutate(MaxEasements=max(RestoredEasementAcres, na.rm=TRUE),
                                                        EverTreated=ifelse(MaxEasements>0,1,0))
treated_huc<-subset(ease_huc12, select=c(huc12, EverTreated))
treated_huc<-distinct(treated_huc)

#create facility id if in a treated subwatershed
pws_facilities$wetland<-ifelse(pws_facilities$HUC_12 %in% treated_huc$huc12, 1, 0)
table(pws_facilities$wetland)

#in huc12 that ever have wetland easements, there are 11,374 active facilities. 

table(pws_facilities$wetland, pws_facilities$WATER_TYPE_CODE)


#OK lets count how many systems are in each huc12.
#look at all facilities, active facilities, active facilities dependent on groundwater. 
#then do this by size (small, medium, large)
#size is determine by population served. 

pws<-subset(pws_facilities, select=c(PWSID, HUC_12, GW_SW_CODE, PWS_ACTIVITY_CODE, HUC2, wetland, `Population<br> Served Count`))
colnames(pws)<-c("pwsid", "huc12", "gw_sw", "active", "huc2", "wetland", "population")

#population categories
pws$pop_cat <- ifelse(pws$population < 10000, "small", 
                      ifelse(pws$population > 100000, "large", "medium"))
table(pws$pop_cat)

#create counts of facilities by huc12 that by gw_sw, active, and pop_cat
fc<-pws %>% group_by(huc12) %>% summarise(n_pws= n(), .groups = "drop") 

fc <- pws %>%
  group_by(huc12) %>%
  summarise(
    # Total count of facilities per huc12
    n_pws = n(), 
    # Count of active facilities
    n_pws_active = sum(active == "A"),  
    # Count of surface water facilities
    n_pws_sw = sum(gw_sw == "SW"),  
    # Count of active AND surface water facilities
    n_pws_active_sw = sum(active == "A" & gw_sw == "SW"),  
    # Count of facilities by population category
    n_pws_small = sum(pop_cat == "small"),  
    n_pws_medium = sum(pop_cat == "medium"),  
    n_pws_large = sum(pop_cat == "large"), 
    # Count of active facilities by population category
    n_pws_active_small = sum(active == "A" & pop_cat == "small"),
    n_pws_active_medium = sum(active == "A" & pop_cat == "medium"),
    n_pws_active_large = sum(active == "A" & pop_cat == "large"),
    # Count of active facilities by population category that rely on surface water
    n_pws_active_sw_small = sum(active == "A" & pop_cat == "small" & gw_sw == "SW"),
    n_pws_active_sw_medium = sum(active == "A" & pop_cat == "medium" & gw_sw == "SW"),
    n_pws_active_sw_large = sum(active == "A" & pop_cat == "large" & gw_sw == "SW"),
    .groups = "drop"
  )

fc$huc8<-substring(fc$huc12, 1,8)

fc_huc8 <- fc %>%
  group_by(huc8) %>%
  summarise(
    across(where(is.numeric), \(x) sum(x, na.rm = TRUE)),  # Use a lambda function
    .groups = "drop"
  )

#save the fc counts by huc8 for the map
setwd("C:/Users/n57m645/OneDrive - Montana State University/UW_Madison/Water quality/Data/Water System EPA")
save(fc_huc8, file="count_pws_type.RData")

